* Construct specialty-year instrument  

	use "${intermediate_data}/RVU/npi_hcpcs_pos_panel.dta", clear	
	merge m:1 hcpcs place_of_service using "${temp}/rvu_values_hcpcs_pos_wide.dta", keep(match) nogen
	merge m:1 npi year using "${clean_data}/panel_physicians_clean.dta", nogen keep(match) keepusing(age nrmp_spec_desc)
	keep if inrange(age,40,55) 
	
	* Compute average number of line items per NRMP specilaty procedure-POS over time 
	* This will be the reference quantity that is time-invariant 
	gegen line_srvc_cnt_mean=mean(line_srvc_cnt), by(npi hcpcs place_of_service)
	keep npi nrmp_spec_desc hcpcs place_of_service rvu_value_hcpcs_pos2* line_srvc_cnt_mean
	gduplicates drop 
	gisid npi hcpcs place_of_service
	
	* Quantity-invariant RVUs that only vary due to policy changes 
	
	forvalues year=2003/2019 {
		gen rvu`year'_atfixq_hcpcs_pos=rvu_value_hcpcs_pos`year'*line_srvc_cnt_mean
	}
	
	* Add-up RVUs at fixed quantities to the npi-year level

	forvalues year=2003/2019 {
		gegen rvu`year'_atfixq_doc_yr=total(rvu`year'_atfixq_hcpcs_pos), by(npi)
	}
	
	* Take the mean at the speacialty-year level
	
	forvalues year=2003/2019 {
		gegen rvu`year'_atfixq_spec_yr=mean(rvu`year'_atfixq_doc_yr), by(nrmp_spec_desc)
	}
			
	keep nrmp_spec_desc *_spec_yr
	gduplicates drop
	reshape long rvu@_atfixq_spec_yr, i(nrmp_spec_desc) j(year)
	save "${intermediate_data}/spec_choice/rvus_instrument_nrmp_spec.dta", replace

* Specialty choice model

	frames reset
	
	use "${intermediate_data}/NRMP/specialty_applicant.dta", clear
	
	keep year nrmp_spec_desc n_dist_match_sn_* n_dist_no_match_sn_*
	
	* Aggregate specialties to balance the data and remove very small data cells
	
	replace nrmp_spec_desc="Surgery" if nrmp_spec_desc=="Vascular Surgery"
	replace nrmp_spec_desc="Surgery" if nrmp_spec_desc=="Neurological Surgery"
	replace nrmp_spec_desc="Surgery" if nrmp_spec_desc=="Otolaryngology"
	replace nrmp_spec_desc="Pediatrics" if nrmp_spec_desc=="Child Neurology"
	replace nrmp_spec_desc="Pediatrics" if nrmp_spec_desc=="Internal Medicine/Pediatrics"
	replace nrmp_spec_desc="Internal Medicine" if nrmp_spec_desc=="Neurology"
	
	collapse (sum) n_*, by (nrmp_spec_desc year)
	
	reshape long n_dist_match_sn_ n_dist_no_match_sn_, ///
		i(year nrmp_spec_desc) j(score_group)
		
	* Add up score groups with scores below 180 and 180-190
	* to decrease the number of small cells
	
	replace score_group=190 if score_group==180
	replace score_group=270 if score_group==300
	collapse (sum) n_dist_match_sn_ n_dist_no_match_sn_, by(year nrmp_spec_desc score_group)
	
	rename (n_dist_match_sn_  n_dist_no_match_sn_) (number_matched number_didntmatch)
	
	* Address zero shares
	
	replace number_didntmatch=1 if number_didntmatch==0
	
	egen number_applicants=rowtotal(number_matched number_didntmatch)	
	egen number_graduates=total(number_applicants), by(year score_group)
	
	gen share_in_spec=number_applicants/number_graduates
		
	* Merge in characteristics of specialties 

	merge m:1 nrmp_spec_desc year using "${intermediate_data}/specialty_characteristics/nrmp_spec_characteristics_age4055.dta", keep(match) nogen keepusing(nrmp_spec_code ptotinc wkh sd_ptotinc einsize female freq)
	merge m:1 nrmp_spec_desc year using "${intermediate_data}/spec_choice/rvus_instrument_nrmp_spec.dta", keep(master match) nogen 

	ren freq N_UR
	gen hourly_income=ptotinc/(52*wkh)
	gen hourly_rvus=rvu_atfixq_spec_yr/(52*wkh)
	drop ptotinc wkh 
	replace sd_ptotinc=sd_ptotinc/1000
	rename (hourly_income sd_ptotinc einsize female) (mean_hourly_income mean_sd_ptotinc mean_einsize share_female)
	keep year nrmp_spec_desc nrmp_spec_code score_group ///
		number_graduates number_applicants share_in_spec  hourly_rvus ///
		mean_hourly_income mean_sd_ptotinc mean_einsize share_female N_UR	
		
	local features share_in_spec mean_hourly_income hourly_rvus mean_sd_ptotinc mean_einsize share_female
	
	foreach feature in `features' {
	gen x_oo_`feature'=`feature' if nrmp_spec_desc=="Family Medicine"
	egen oo_`feature'=max(x_oo_`feature'), by(year score_group)
	}	
	drop x_*
	
	* Generate difference in log shares
	gen LogShareDiff=log(share_in_spec)-log(oo_share_in_spec)
	
	* Generate differenced features
	local features mean_hourly_income hourly_rvus mean_sd_ptotinc mean_einsize share_female 
	
	foreach feature in `features' {
	gen d_`feature'=`feature'-oo_`feature'
	}
	
	* Cross-check that shares add up to 1
	 egen shares_check=total(share_in_spec), by(score_group year)
	 assert shares_check==1
	  
	* Manual first stage for one score group
	gen d_hourly_inc_270 = d_mean_hourly_income * 270.score_group
	reg d_hourly_inc_270 c.d_hourly_rvus#score_group d_hourly_rvus i.score_group d_mean_sd_ptotinc d_mean_einsize d_share_female ib1.nrmp_spec_code if nrmp_spec_code!=4, robust
	
	preserve
		parmest, norestore stars(0.10 0.05 0.01) es(N) 
		g depvar = "d_hourly_inc_270"
		gen SEtype = "robust"
		tempfile fs
		save `fs', replace
	restore
	
	reg d_hourly_inc_270 c.d_hourly_rvus#score_group d_hourly_rvus i.score_group d_mean_sd_ptotinc d_mean_einsize d_share_female ib1.nrmp_spec_code if nrmp_spec_code!=4, vce(cluster nrmp_spec_code)
	
	preserve
		parmest, norestore stars(0.10 0.05 0.01) es(N) 
		g depvar = "d_hourly_inc_270"
		gen SEtype = "cluster"
		tempfile fsClust
		save `fsClust', replace
	restore
	
	ivregress 2sls LogShareDiff (c.d_mean_hourly_income#score_group d_mean_hourly_income = c.d_hourly_rvus#score_group d_hourly_rvus) i.score_group d_mean_sd_ptotinc d_mean_einsize d_share_female ib1.nrmp_spec_code if nrmp_spec_code!=4, robust
	parmest, saving("${output}/spec_choice/estimates-01-group_level_model_2SLS.dta", replace) stars(0.10 0.05 0.01) es(N)

	forvalues s=200(10)270 {
		margins, dydx(d_mean_hourly_income) at(score_group=`s')
		matrix marg`s' = r(b)	
		matrix varDelta`s' = r(V)

		local marg`s' = marg`s'[1,1]
		local seDelta`s' = sqrt(varDelta`s'[1,1])
	
		di "Marginal effect and delta-method SE for score group `s':"
		di `marg`s''
		di `seDelta`s''
	}
	
	ivregress 2sls LogShareDiff (c.d_mean_hourly_income#score_group d_mean_hourly_income = c.d_hourly_rvus#score_group d_hourly_rvus) i.score_group d_mean_sd_ptotinc d_mean_einsize d_share_female ib1.nrmp_spec_code if nrmp_spec_code!=4, vce(cluster nrmp_spec_code)
	parmest, saving("${output}/spec_choice/estimates-01-group_level_clustered_2SLS.dta", replace) stars(0.10 0.05 0.01) es(N)
	
	forvalues s=200(10)270 {
		margins, dydx(d_mean_hourly_income) at(score_group=`s')
		matrix margClust`s' = r(b)	
		matrix varClust`s' = r(V)

		local margClust`s' = margClust`s'[1,1]
		local seClust`s' = sqrt(varClust`s'[1,1])
	
		di "Marginal effect and delta-method clustered SE for score group `s':"
		di `margClust`s''
		di `seClust`s''
	}

	frame create aggreg_est
		frame aggreg_est {
		
		use "$output/spec_choice/estimates-01-group_level_model_2SLS.dta", clear
		gen SEtype = "robust"
		gen marg = .
		gen seDelta = .
		
		forvalues s=200(10)270 {
			replace marg = `marg`s'' if substr(parm, 1, 3) == "`s'" & strpos(parm,"d_mean_hourly_income") > 0
			replace seDelta = `seDelta`s'' if substr(parm, 1, 3) == "`s'" & strpos(parm,"d_mean_hourly_income") > 0
		}
		
		append using "${output}/spec_choice/estimates-01-group_level_clustered_2SLS.dta"
		replace SEtype = "cluster" if mi(SEtype)
		forvalues s=200(10)270 {
			replace marg = `margClust`s'' if SEtype == "cluster" & substr(parm, 1, 3) == "`s'" & strpos(parm,"d_mean_hourly_income") > 0
			replace seDelta = `seClust`s'' if SEtype == "cluster" & substr(parm, 1, 3) == "`s'" & strpos(parm,"d_mean_hourly_income") > 0
		}

		g depvar = "LogShareDiff"
		append using `fs'
		append using `fsClust'

		
	} 

	* Export and format estimates

	labmask nrmp_spec_code, values(nrmp_spec_desc)
	local lbe : value label nrmp_spec_code
	levelsof nrmp_spec_code, loc(values)
	
	foreach i of num `values' {
		local lab`i' : label `lbe' `i'
		frame aggreg_est: replace parm = "`lab`i''" if substr(parm, 1, strpos(parm, ".")-1) == "`i'" & regexm(parm,".nrmp_spec_code")
	} 
	
	frame aggreg_est {
		keep parm estimate SEtype stderr stars es_1 depvar marg seDelta
		ren es_1 N_UR
		drbcount N_UR , g(N)
		drbest_docinc estimate stderr marg seDelta
		order N_UR, last

		replace SEtype = "Cluster" if SEtype == "cluster"
		replace SEtype = "Robust" if SEtype == "robust"

		reshape clear
		reshape i parm depvar
		reshape j SEtype, str
		reshape xi estimate marg N N_UR
		reshape xij stderr seDelta stars
		reshape wide

		sort depvar parm

		export delimited using "${mypath}/intermediate_csv/estimates-01-group_level_model_2SLS.csv", replace dataf delim(tab) 
	}

* Counterfactuals 

	local features mean_hourly_income mean_sd_ptotinc mean_einsize share_female 
	
	foreach feature in `features' {
		replace d_`feature'=`feature'
	}
 
	predict obs_util, xb
	predict xi, residuals
	gen delta_sta=obs_util+xi
	
	gen share_in_spec_pred_numerator=exp(delta_sta)
	egen share_in_spec_pred_denominator=sum(share_in_spec_pred_numerator), by(year score_group)	
	gen share_in_spec_pred=share_in_spec_pred_numerator/share_in_spec_pred_denominator
	
	* Pay Internal Medicine same as Dermatology

	local counterfactual "Income changes (2SLS)"
	
	gen x_cntf_inc=mean_hourly_income if nrmp_spec_desc=="Dermatology"
	egen cntf_inc=max(x_cntf_inc), by(year score_group)
	replace d_mean_hourly_income=cntf_inc if nrmp_spec_desc=="Internal Medicine"
	
	predict obs_util_IMsameasDerm, xb
	gen delta_sta_IMsameasDerm=obs_util_IMsameasDerm+xi
	
	gen share_IMsameasDerm_numerator=exp(delta_sta_IMsameasDerm)
	egen share_IMsameasDerm_denominator=sum(share_IMsameasDerm_numerator), by(year score_group)	
	gen share_IMsameasDerm=share_IMsameasDerm_numerator/share_IMsameasDerm_denominator
	
	* Cross-check that all predicted shares add up to 1
	
	foreach pred in share_in_spec_pred share_IMsameasDerm {
	 egen `pred'_check=total(`pred'), by(year score_group)
	 assert `pred'_check>0.99 & `pred'_check<1.01
	}
	
	drop *check 

	preserve 
		use "${output}/spec_choice/estimates-01-group_level_model_2SLS.dta", clear
		qui su estimate if parm == "d_mean_hourly_income"
		replace estimate = estimate + `r(mean)'
		gen score_group = substr(parm, 1,3) 
		destring score_group, replace force
		keep if  strpos(parm,"group#c") > 0
		keep estimate score_group
		tempfile estimates
		save `estimates'
	restore
	
* Own- and cross- income elasticities
	
	merge m:1 score_group using `estimates', nogen assert(3)
	
	levelsof nrmp_spec_code, loc(nrmp)
		
	foreach spec of local nrmp {
		gen elasticity`spec' = .
	
		foreach score of num 190(10)270 {
			
			// Compute own/crossincome elasticities
			replace elasticity`spec' = (-1)*estimate * mean_hourly_income * share_in_spec if nrmp_spec_code != `spec' & score_group == `score' // j!=k
			replace elasticity`spec' = estimate * mean_hourly_income * (1-share_in_spec) if nrmp_spec_code == `spec' & score_group == `score' // j=k
		}
	}
	
	drop estimate 
	
* Counts and shares for output

	egen count_anygrads=total(number_applicants), by(year)
	egen count_byscore=total(number_applicants), by(score_group year)
	egen count_byspeccat=total(number_applicants), by(nrmp_spec_desc year)
	gen count_byscore_byspeccat=number_applicants
	gen share_score=count_byscore/count_anygrads
	gen share_speccat=count_byspeccat/count_anygrads
	gen share_in_score_byspec=count_byscore_byspeccat/count_byspeccat
	gen share_in_spec_byscore=count_byscore_byspeccat/count_byscore	
	
	assert share_in_spec_byscore==share_in_spec
	
	drop share_in_spec share_female

	* Save data	
	
	drop *numerator *denominator
	keep year nrmp_spec_code nrmp_spec_desc score_group mean_hourly_income ///
		count_* share* elasticity* N_UR
	
	order year score_group nrmp_spec_code nrmp_spec_desc mean_hourly_income ///
		count_anygrads count_byscore count_byspeccat count_byscore_byspeccat ///
		share_score share_speccat share_in_score_byspec share_in_spec_byscore ///
		share_in_spec_pred share_IMsameasDerm elasticity*
	sort year score_group nrmp_spec_code 	
	isid year score_group nrmp_spec_code
	
	* Re-check that all shares add up to 1 before saving
	
	foreach pred in share_in_spec_byscore share_in_spec_pred share_IMsameasDerm {
	 egen `pred'_check=total(`pred'), by(year score_group)
	 assert `pred'_check>0.99 & `pred'_check<1.01
	}
	
	foreach pred in share_in_score_byspec {
	 egen `pred'_check=total(`pred'), by(year nrmp_spec_desc)
	 assert `pred'_check>0.99 & `pred'_check<1.01
	}
	drop *check 
	
	save "${output}/spec_choice/estimates-02-group_level_cntf_and_elast_2SLS.dta", replace
	
	* Adjust data for review bord disclosure rules and export
	keep if year==2016
	label val nrmp_spec_code
	loc varlist = ""
	loc countlist = ""
	
	foreach v of varl elasticity* {
		loc varlist "`varlist' `v'" 
	}			
	
	drbest_docinc `varlist' mean_hourly_income
	drbcount N_UR, g(N)
	order year score_group nrmp* mean_hourly* elasticity* N N_UR 
	export delimited using "${mypath}/intermediate_csv/estimates-02-group_level_cntf_and_elast_2SLS.csv", replace dataf delim(tab) 